library(gdata)
# Prints 5 to the console (the purpose is not evident from the script).
print(5)
# Every relative path used below is resolved against this directory.
setwd("/users/brucedesmarais/Dropbox/SenPE/Data/")
library(foreign)
# Stata file read with foreign::read.dta.
sendat <- read.dta("senator_data.dta")

# Workbooks to load. The order matters: element i of dat is paired with
# congs[i] further down in the script.
files <- c(
  "form0304fnl.xls", "form0506fnl.xls", "form0708fnl.xls",
  "form7980fnl.xls", "form8182fnl.xls", "form8384fnl.xls",
  "form8586fnl.xls", "form8788fnl.xls", "form8990fnl.xls",
  "form9192fnl.xls", "form9394fnl.xls", "form9596fnl.xls",
  "form9798fnl.xls"
)

# One data frame per workbook, in the order of `files`. Strings stay
# character, and both "NA" and empty cells are read as missing.
dat <- lapply(files, function(fname) {
  read.xls(
    paste0("./PEDat/", fname),
    stringsAsFactors = FALSE,
    na.strings = c("NA", "")
  )
})

# Names of the senator columns in the PE data: "s1" through "s15"
scols <- paste0("s", 1:15)
# Congress number belonging to each element of dat, in the same order
congs <- as.numeric(c(108:110, 96:105))

# ID table used for matching; text columns are kept as character vectors
id_master <- read.csv(file = "id_master.csv", stringsAsFactors = FALSE)


# Create the event Matrices

# Shorten names that contain exactly one comma.
#
# An element with a single comma that is followed by at least two more
# characters is cut after the second character following the comma, e.g.
# "Smith, John" becomes "Smith, J". Every other element (no comma, more than
# one comma, fewer than two characters after the comma, or NA) is returned
# unchanged.
#
# x: character vector of names.
# Returns a character vector of the same length as x; zero-length input
# gives character(0).
procName <- function(x) {
  xvec <- character(length(x))
  # seq_along() rather than 1:length(x): the latter iterates over c(1, 0)
  # for empty input and fails inside strsplit()[[1]].
  for (i in seq_along(x)) {
    chari <- x[i]
    # Positions of all commas in this name (empty for NA).
    comind <- which(strsplit(chari, split = "")[[1]] == ",")
    if (length(comind) == 1 && comind + 1 < nchar(chari)) {
      chari <- substr(chari, 1, comind + 2)
    }
    xvec[i] <- chari
  }
  xvec
}

# Build one event-by-name incidence matrix per workbook in dat.
#
# mats[[i]] gets one row per retained event of dat[[i]] and one column per
# name in that file's roster. An entry is 1 when the (shortened) name appears
# in one of the scols columns of the event and 0 otherwise; events without
# any roster match are dropped. Row names are the event's `issue` value.
# nattend collects, over all files, the number of non-missing attendee cells
# of every event that has a value in s1.
mats <- list()

nattend <- NULL
ind <- 1
# Positions in dat whose roster is taken from id_master; the other files
# build their roster from the names found in the workbook itself.
ifow <- c(1, 4:13)
for (i in seq_along(dat)) {
  dati <- dat[[i]]
  congi <- congs[i]
  if (is.element(i, ifow)) {
    # Column 1 of id_master is compared with the congress number; column 5
    # supplies the names, which are then shortened with procName.
    idsi <- id_master[which(id_master[, 1] == congi), 5]
    idsi <- idsi[which(!is.na(idsi))]
    idsi <- procName(idsi)
  } else {
    # NOTE(review): these roster names are not passed through procName,
    # while the attendee names below are - confirm this is intended.
    idsi <- as.character(unique(c(as.matrix(dati[, scols]))))
    idsi <- idsi[which(!is.na(idsi))]
  }
  veci <- numeric(length(idsi))
  issue <- NULL
  # Rows are collected in a list and bound once after the loop instead of
  # growing the matrix with rbind() on every event.
  rowsi <- list()
  # seq_len() so a sheet with zero rows is skipped instead of failing.
  for (j in seq_len(nrow(dati))) {
    if (!is.na(dati$s1[j])) {
      attend <- dati[j, scols]
      attend <- as.character(attend[which(!is.na(attend))])
      nattend <- c(nattend, length(attend))
      print(attend)
      attend <- procName(attend)
      vecij <- veci
      # Names missing from the roster give NA positions, which a length-one
      # replacement ignores.
      vecij[match(attend, idsi)] <- 1
      # Keep the event only if at least one attendee was matched. any()
      # also copes with an empty roster, where mean() would return NaN.
      if (any(vecij != 0)) {
        rowsi[[length(rowsi) + 1]] <- vecij
        issue <- c(issue, dati$issue[j])
      }
    }
  }
  # A file without any matched event yields a zero-row matrix, so the
  # dimnames assignments below still work.
  if (length(rowsi) > 0) {
    mati <- do.call(rbind, rowsi)
  } else {
    mati <- matrix(0, nrow = 0, ncol = length(idsi))
  }
  colnames(mati) <- idsi
  rownames(mati) <- issue
  mats[[ind]] <- mati
  ind <- ind + 1
}

wnets <- list()
topicNets <- list()
# Create Weighted Networks
#
# For mats[[4]] ... mats[[13]] this builds
#   wnets[[i - 3]]     a square matrix whose [j, k] entry is the number of
#                      rows of mats[[i]] in which columns j and k are both 1
#                      (diagonal fixed at 0), and
#   topicNets[[i - 3]] a nested list whose [[j]][[k]] element holds the row
#                      names of mats[[i]] for those shared rows (k != j).
for (i in 4:13) {
  mati <- mats[[i]]
  nsen <- ncol(mati)
  # t(mati) %*% mati gives all pairwise co-occurrence counts at once; the
  # entries of mati are 0/1, so this equals sum(mati[, j] * mati[, k]).
  neti <- crossprod(mati)
  diag(neti) <- 0
  dimnames(neti) <- list(colnames(mati), colnames(mati))
  tneti <- list()
  # seq_len() keeps the loops empty when mati has no columns.
  for (j in seq_len(nsen)) {
    tnetj <- list()
    for (k in setdiff(seq_len(nsen), j)) {
      ijrow <- which(mati[, j] * mati[, k] == 1)
      tnetj[[k]] <- rownames(mati)[ijrow]
    }
    tneti[[j]] <- tnetj
  }
  wnets[[i - 3]] <- neti
  topicNets[[i - 3]] <- tneti
}
names(wnets) <- 96:105
names(topicNets) <- 96:105

# Name-overlap check between consecutive matrices, starting at mats[[4]].
i <- 4
# Redefinition of procName (a function of the same name is defined earlier in
# the script); from this point on this version is the one in effect.
#
# An element of x with exactly one comma followed by at least two more
# characters is cut after the second character following the comma; all
# other elements, including NA, are returned unchanged. Zero-length input
# gives character(0).
procName <- function(x) {
  xvec <- character(length(x))
  # seq_along() rather than 1:length(x), which iterates over c(1, 0) for
  # empty input and fails inside strsplit()[[1]].
  for (i in seq_along(x)) {
    chari <- x[i]
    comind <- which(strsplit(chari, split = "")[[1]] == ",")
    if (length(comind) == 1 && comind + 1 < nchar(chari)) {
      chari <- substr(chari, 1, comind + 2)
    }
    xvec[i] <- chari
  }
  xvec
}
# Share of the shortened column names of mats[[i]] that also occur in
# mats[[i + 1]]. The counter is advanced on the same line so the line can be
# re-run to step through consecutive pairs.
mean(is.element(procName(colnames(mats[[i]])), procName(colnames(mats[[i + 1]])))); i <- i + 1

# Pool the shortened column names of mats[[4]] ... mats[[13]] into one sorted
# vector without duplicates.
unames <- sort(unique(unlist(
  lapply(4:13, function(ix) procName(colnames(mats[[ix]])))
)))

# sumList[[a]][[b]] starts as a 1 x 2 matrix of zeros for every ordered pair
# of positions in unames; rows are appended to these placeholders later.
sumList <- list()
for (i in 1:length(unames)) {
  sumList[[i]] <- rep(list(t(c(0, 0))), length(unames))
}
	
# For every ordered pair of distinct columns (j, k) of mats[[4]] ...
# mats[[13]], append the row names of the rows where both columns are 1,
# together with the matching entry of 96:105, to the sumList element
# addressed by the positions of the two shortened names in unames.
for (i in 4:13) {
  mati <- mats[[i]]
  namesi <- procName(colnames(mats[[i]]))
  for (j in 1:length(namesi)) {
    # The lookup for j does not depend on k, so it is done once per j.
    namej <- namesi[j]
    indj <- which(unames == namej)
    for (k in (1:length(namesi))[-j]) {
      namek <- namesi[k]
      indk <- which(unames == namek)
      ijrow <- which(mati[, j] * mati[, k] == 1)
      sumsjk <- rownames(mati)[ijrow]
      if (length(sumsjk) == 0) {
        next
      }
      sumList[[indj]][[indk]] <- rbind(
        sumList[[indj]][[indk]],
        cbind(sumsjk, (96:105)[i - 3])
      )
    }
  }
}

# Label both list levels with the pooled names.
names(sumList) <- unames
sumList <- lapply(sumList, setNames, unames)

# For every ordered pair of distinct names, take column 1 of its sumList
# entry. Pairs with more than four values (placeholder included) are split
# into an earlier and a later part, each pasted into one space-separated
# string and appended to `first` and `last` respectively.
first <- NULL
last <- NULL
for (i in 1:length(unames)) {
  for (j in (1:length(unames))[-i]) {
    sumsij <- sumList[[i]][[j]][, 1]
    if (length(sumsij) <= 4) {
      next
    }
    # Element 1 is the zero placeholder the entry was initialised with.
    sumsij <- sumsij[-1]
    half <- round(length(sumsij) / 2)
    sumsij1 <- paste(sumsij[seq_len(half)], collapse = " ")
    sumsij2 <- paste(sumsij[-seq_len(half)], collapse = " ")
    first <- c(first, sumsij1)
    last <- c(last, sumsij2)
  }
}
	
# Find Multibyte strings
# tolower() is attempted on every element; the class of the try() result is
# "try-error" exactly when the call failed.
iserr1 <- character(length(first))
iserr2 <- character(length(first))
for (i in 1:length(first)) {
  iserr1[i] <- class(try(tolower(first[i])))
  iserr2[i] <- class(try(tolower(last[i])))
}

# Keep only the positions where neither string raised an error.
ok <- iserr1 != "try-error" & iserr2 != "try-error"
first <- first[ok]
last <- last[ok]
	
library(tm)
# One corpus and one document-term matrix per half; each element of `first`
# and `last` is treated as one document.
FirstCorp <- Corpus(VectorSource(first))
LastCorp <- Corpus(VectorSource(last))

firstDTM <- DocumentTermMatrix(FirstCorp)
lastDTM <- DocumentTermMatrix(LastCorp)
library(topicmodels)
### Estimate Topic Model for First Half

# Drop documents whose row total is zero before fitting.
nterms <- apply(firstDTM, 1, sum)
firstDTM <- firstDTM[which(nterms > 0), ]

# Random half split of the documents; only the commented-out
# cross-validation below uses it.
set.seed(5)
train <- sample(1:nrow(firstDTM), round(nrow(firstDTM) / 2))
test <- (1:nrow(firstDTM))[-train]

#ntops <- seq(20,60,by=5)
#firstCVLL <- numeric(length(ntops))
#for(i in 1:length(ntops)){
#firstTopic <- LDA(firstDTM[train,],ntops[i])
#ft_inf <- LDA(firstDTM[test,],model=firstTopic, control=list(estimate.beta=F))
#firstCVLL[i] <- logLik(ft_inf)
#print(ntops[i])
#}

# 50-topic model. The seed is passed through `control` because topicmodels
# takes the seed for its default (VEM) fit from the control object rather
# than from R's RNG state, so set.seed() alone does not make the fit
# repeatable.
# NOTE(review): confirm against the installed topicmodels version.
firstTopic <- LDA(firstDTM, 50, control = list(seed = 5L))
firstPost <- posterior(firstTopic)
# Mean posterior topic weight over documents, used to rank the topics.
topicLik1 <- apply(firstPost$topics, 2, mean)
ordTop1 <- order(-topicLik1)
# Top five terms of the ten highest-ranked topics.
terms(firstTopic, 5)[, ordTop1[1:10]]

### Estimate Topic Model for Second Half

# Drop documents whose row total is zero before fitting.
nterms <- apply(lastDTM, 1, sum)
lastDTM <- lastDTM[which(nterms > 0), ]

# 50-topic model. The seed is passed through `control` because topicmodels
# takes the seed for its default (VEM) fit from the control object rather
# than from R's RNG state, so the fit would otherwise differ between runs.
# NOTE(review): confirm against the installed topicmodels version.
lastTopic <- LDA(lastDTM, 50, control = list(seed = 5L))
lastPost <- posterior(lastTopic)
# Mean posterior topic weight over documents, used to rank the topics.
topicLik2 <- apply(lastPost$topics, 2, mean)
ordTop2 <- order(-topicLik2)
# Top five terms of the ten highest-ranked topics.
terms(lastTopic, 5)[, ordTop2[1:10]]

# Difference in Proportions for Terms
# One matrix over both halves: rows 1..length(first) come from `first`, the
# following length(first) rows from `last`.
allCorp <- Corpus(VectorSource(c(first, last)))
allDTM <- DocumentTermMatrix(allCorp)

nhalf <- length(first)
rows1 <- 1:nhalf
rows2 <- (nhalf + 1):(2 * nhalf)

# Per-term column means within each half.
prop1 <- apply(allDTM[rows1, ], 2, mean)
prop2 <- apply(allDTM[rows2, ], 2, mean)

# Keep only the terms whose mean exceeds .05 in both halves. Column means
# are unaffected by dropping other columns, so the kept values are subset
# rather than recomputed.
keepTerms <- which(prop1 > .05 & prop2 > .05)
allDTM <- allDTM[, keepTerms]
prop1 <- prop1[keepTerms]
prop2 <- prop2[keepTerms]
difProp <- prop2 - prop1


# The 25 most negative and the 25 most positive differences.
top25first <- order(difProp)[1:25]
top25last <- order(-difProp)[1:25]

cbind(colnames(allDTM)[top25first],prop1[top25first],prop2[top25first],difProp[top25first])
cbind(colnames(allDTM)[top25last],prop1[top25last],prop2[top25last],difProp[top25last])

## Chi-Square Assessment of Fit ##
# xtable() is called at the end of this section, but its package is not
# attached anywhere in the visible script, so it is attached here.
library(xtable)

# Indicator for the second block of rows of allDTM (the `last` documents).
second <- numeric(nrow(allDTM))
second[(length(first) + 1):(2 * length(first))] <- 1
# Terms that are not tested keep statistic 1 and p-value 1.
stats <- rep(1, ncol(allDTM))
pvals <- rep(1, ncol(allDTM))
for (i in seq_len(ncol(allDTM))) {
  # NOTE(review): this relies on c() turning a one-column document-term
  # matrix into a plain vector of counts - confirm with the installed
  # tm/slam versions.
  term <- c(allDTM[, i])
  # Only terms with a total count above 20 are tested.
  if (sum(term) > 20) {
    testi <- chisq.test(second, term)
    stats[i] <- testi$statistic
    pvals[i] <- testi$p.value
  }
}

# The 20 terms with the smallest p-values.
top20 <- order(pvals)[1:20]

topTab <- cbind(colnames(allDTM)[top20],prop1[top20],prop2[top20],difProp[top20])

# Take out generic terms
# A logical filter is used instead of negative match() positions: a generic
# term that is absent from the top 20 gives an NA position, which cannot be
# used as a negative subscript.
generic <- c("the", "and", "with", "for")
topTab <- topTab[!(topTab[, 1] %in% generic), , drop = FALSE]

# Just make it 15
topTab <- topTab[seq_len(min(15, nrow(topTab))), , drop = FALSE]

# cbind() above produced a character matrix; turn columns 2 to 4 back into
# numbers.
topTab <- data.frame(topTab)
topTab[, 2:4] <- as.numeric(as.matrix(topTab[, 2:4]))

# Table of the three numeric columns times 100, ordered by the difference.
# `digits` is spelled out; the original `digit` relied on partial argument
# matching.
xtable(topTab[order(topTab[, 4]), 2:4] * 100, digits = 3)

# Tag Clouds
library(wordcloud)
# Terms (column names) of each document-term matrix ...
words1 <- dimnames(firstDTM)[[2L]]
words2 <- dimnames(lastDTM)[[2L]]
# ... and their total counts over all documents.
freq1 <- apply(firstDTM, MARGIN = 2, FUN = sum)
freq2 <- apply(lastDTM, MARGIN = 2, FUN = sum)



